package ivory.core.tokenize;

import ivory.core.Constants;

import java.io.IOException;
import java.io.StringReader;
import java.text.Normalizer;
import java.text.Normalizer.Form;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;

public class LuceneArabicAnalyzer extends ivory.core.tokenize.Tokenizer {
  private static final Logger LOG = Logger.getLogger(LuceneArabicAnalyzer.class);
  static {
    LOG.setLevel(Level.WARN);
  }

  private org.apache.lucene.analysis.Tokenizer tokenizer;

  @Override
  public void configure(Configuration conf) {
    configure(conf, null);
  }

  @Override
  public void configure(Configuration conf, FileSystem fs) {
    // Read stopwords from file (the stopword set will be empty if the file does not exist or is empty).
    String stopwordsFile = conf.get(Constants.StopwordList);
    stopwords = readInput(fs, stopwordsFile);
    String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);

    isStopwordRemoval = !stopwords.isEmpty();
    isStemming = conf.getBoolean(Constants.Stemming, true);

    // Load the collection vocabulary, if one is provided; otherwise leave it unset.
    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      LOG.warn("No vocabulary provided to tokenizer.");
      vocab = null;
    }

    LOG.warn("Stemming is " + isStemming + "; Stopword removal is " + isStopwordRemoval
        + "; number of stopwords: " + stopwords.size() + "; stemmed: " + stemmedStopwords.size());
  }

  @Override
  public String[] processContent(String text) {
    text = preNormalize(text);

    // Tokenize and lowercase, then apply post-normalization and Unicode NFKC normalization.
    tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(text));
    TokenStream tokenStream = new LowerCaseFilter(Version.LUCENE_35, tokenizer);
    String tokenized = postNormalize(streamToString(tokenStream));
    tokenized = Normalizer.normalize(tokenized, Form.NFKC);

    // Remove stopwords from the surface (unstemmed) tokens.
    StringBuilder finalTokenized = new StringBuilder();
    for (String token : tokenized.split(" ")) {
      if (isStopwordRemoval() && isDiscard(false, token)) {
        continue;
      }
      finalTokenized.append(token + " ");
    }
    String stemmedTokenized = finalTokenized.toString().trim();

    if (isStemming()) {
      // Then, run the Lucene normalization and stemming on the stopword-removed text.
      stemmedTokenized = stem(stemmedTokenized);
    }
    return stemmedTokenized.split(" ");
  }

  @Override
  public String stem(String token) {
    // Arabic normalization and light stemming; tokens missing from the vocabulary (if set) are dropped.
    tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(token));
    TokenStream tokenStream = new ArabicStemFilter(new ArabicNormalizationFilter(tokenizer));

    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    StringBuilder stemmed = new StringBuilder();
    try {
      while (tokenStream.incrementToken()) {
        String curToken = termAtt.toString();
        if (vocab != null && vocab.get(curToken) <= 0) {
          continue;
        }
        stemmed.append(curToken + " ");
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return stemmed.toString().trim();
  }

  @Override
  public float getOOVRate(String text, VocabularyWritable vocab) {
    int countOOV = 0, countAll = 0;

    // Tokenize, filter, and lowercase the text, then apply post-normalization.
    tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(text));
    TokenStream tokenStream = new StandardFilter(Version.LUCENE_35, tokenizer);
    tokenStream = new LowerCaseFilter(Version.LUCENE_35, tokenStream);
    String tokenized = postNormalize(streamToString(tokenStream));

    // When stemming is off, count OOV tokens on the surface forms; otherwise collect the
    // stopword-removed text and count OOV tokens on the stemmed forms below.
    StringBuilder finalTokenized = new StringBuilder();
    for (String token : tokenized.split(" ")) {
      if (isStopwordRemoval() && isDiscard(false, token)) {
        continue;
      }
      if (!isStemming()) {
        if (vocab != null && vocab.get(token) <= 0) {
          countOOV++;
        }
        countAll++;
      } else {
        finalTokenized.append(token + " ");
      }
    }

    if (isStemming()) {
      tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(finalTokenized.toString().trim()));
      tokenStream = new ArabicStemFilter(new ArabicNormalizationFilter(tokenizer));

      CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
      tokenStream.clearAttributes();
      try {
        while (tokenStream.incrementToken()) {
          String curToken = termAtt.toString();
          if (vocab != null && vocab.get(curToken) <= 0) {
            countOOV++;
          }
          countAll++;
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    return (countOOV / (float) countAll);
  }
}
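
/*
 * Minimal usage sketch (not part of the original source). It assumes a local Hadoop
 * FileSystem and that the configured stopword lists and collection vocabulary exist
 * at the given paths; the paths below are placeholders.
 *
 *   Configuration conf = new Configuration();
 *   conf.set(Constants.StopwordList, "/path/to/stopwords.ar");
 *   conf.set(Constants.StemmedStopwordList, "/path/to/stopwords.ar.stemmed");
 *   conf.set(Constants.CollectionVocab, "/path/to/vocab.ar");
 *   conf.setBoolean(Constants.Stemming, true);
 *
 *   LuceneArabicAnalyzer analyzer = new LuceneArabicAnalyzer();
 *   analyzer.configure(conf, FileSystem.getLocal(conf));
 *   String[] tokens = analyzer.processContent(arabicText);
 */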